options(warn=-2)
library(pacman)
p_load(dplyr)
source('../lib/import.R')
#
# Es una librería de funciones comunes desarrolladas a partir de este TP.
import('../lib/common-lib.R')
## [1] "-> './reflection.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './csv.R' script loadded successfuly!"
## [1] "-> './plot.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './importance.R' script loadded successfuly!"
## [1] "-> './correlation.R' script loadded successfuly!"
## [1] "-> './pca.R' script loadded successfuly!"
## [1] "-> './set_split.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './models.R' script loadded successfuly!"
## ---
## biotools version 4.1
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './test.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './balance.R' script loadded successfuly!"
## [1] "-> '../lib/common-lib.R' script loadded successfuly!"
#
# Funciones especificas para este TP.
import('../scripts/helper_functions.R')
## [1] "-> '../lib/pca.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> '../lib/importance.R' script loadded successfuly!"
## [1] "-> '../scripts/helper_functions.R' script loadded successfuly!"
set.seed(1)

1. Cargamos el dataset

1.1. Cargamos el dataset nasa-asteroids y excluirnos observaciones con valores faltaste.

dataset <- loadcsv('../datasets/nasa.csv')
str(dataset)
## 'data.frame':    4687 obs. of  40 variables:
##  $ Neo.Reference.ID            : int  3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
##  $ Name                        : int  3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
##  $ Absolute.Magnitude          : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.KM.min.          : num  0.1272 0.1461 0.2315 0.0088 0.1272 ...
##  $ Est.Dia.in.KM.max.          : num  0.2845 0.3266 0.5177 0.0197 0.2845 ...
##  $ Est.Dia.in.M.min.           : num  127.2 146.1 231.5 8.8 127.2 ...
##  $ Est.Dia.in.M.max.           : num  284.5 326.6 517.7 19.7 284.5 ...
##  $ Est.Dia.in.Miles.min.       : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Est.Dia.in.Miles.max.       : num  0.1768 0.203 0.3217 0.0122 0.1768 ...
##  $ Est.Dia.in.Feet.min.        : num  417.4 479.2 759.5 28.9 417.4 ...
##  $ Est.Dia.in.Feet.max.        : num  933.3 1071.6 1698.3 64.6 933.3 ...
##  $ Close.Approach.Date         : chr  "1995-01-01" "1995-01-01" "1995-01-08" "1995-01-15" ...
##  $ Epoch.Date.Close.Approach   : num  7.89e+11 7.89e+11 7.90e+11 7.90e+11 7.90e+11 ...
##  $ Relative.Velocity.km.per.sec: num  6.12 18.11 7.59 11.17 9.84 ...
##  $ Relative.Velocity.km.per.hr : num  22017 65210 27327 40226 35427 ...
##  $ Miles.per.hour              : num  13681 40519 16980 24995 22013 ...
##  $ Miss.Dist..Astronomical.    : num  0.419 0.383 0.051 0.285 0.408 ...
##  $ Miss.Dist..lunar.           : num  163.2 149 19.8 111 158.6 ...
##  $ Miss.Dist..kilometers.      : num  62753692 57298148 7622912 42683616 61010824 ...
##  $ Miss.Dist..miles.           : num  38993336 35603420 4736658 26522368 37910368 ...
##  $ Orbiting.Body               : chr  "Earth" "Earth" "Earth" "Earth" ...
##  $ Orbit.ID                    : int  17 21 22 7 25 40 43 22 100 30 ...
##  $ Orbit.Determination.Date    : chr  "2017-04-06 08:36:37" "2017-04-06 08:32:49" "2017-04-06 09:20:19" "2017-04-06 09:15:49" ...
##  $ Orbit.Uncertainity          : int  5 3 0 6 1 1 1 0 0 0 ...
##  $ Minimum.Orbit.Intersection  : num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Jupiter.Tisserand.Invariant : num  4.63 5.46 4.56 5.09 5.15 ...
##  $ Epoch.Osculation            : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity                : num  0.426 0.352 0.348 0.217 0.21 ...
##  $ Semi.Major.Axis             : num  1.41 1.11 1.46 1.26 1.23 ...
##  $ Inclination                 : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Asc.Node.Longitude          : num  314.4 136.7 259.5 57.2 84.6 ...
##  $ Orbital.Period              : num  610 426 644 514 496 ...
##  $ Perihelion.Distance         : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Perihelion.Arg              : num  57.3 313.1 248.4 18.7 158.3 ...
##  $ Aphelion.Dist               : num  2.01 1.5 1.97 1.53 1.48 ...
##  $ Perihelion.Time             : num  2458162 2457795 2458120 2457902 2457814 ...
##  $ Mean.Anomaly                : num  264.8 173.7 292.9 68.7 135.1 ...
##  $ Mean.Motion                 : num  0.591 0.845 0.559 0.7 0.726 ...
##  $ Equinox                     : chr  "J2000" "J2000" "J2000" "J2000" ...
##  $ Hazardous                   : chr  "True" "False" "True" "False" ...

1.2. Las siguientes columnas las excluimos ya sea por que son no numerica, o colineales con las columna que si son parte de nuestro analisis.

excluded_columns <- c(
  'Neo.Reference.ID',
  'Name',
  'Close.Approach.Date',
  'Epoch.Date.Close.Approach',
  'Orbit.ID',
  'Orbit.Determination.Date',
  'Orbiting.Body',
  'Est.Dia.in.Feet.min.',
  'Est.Dia.in.Feet.max.',
  'Est.Dia.in.M.min.',
  'Est.Dia.in.M.max.',
  'Est.Dia.in.KM.min.',
  'Est.Dia.in.KM.max.',
  'Equinox',
  'Orbit.Uncertainity',
  'Perihelion.Time'
)

ds_step_1 <- dataset %>% dplyr::select(-excluded_columns) %>% na.omit
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(excluded_columns)` instead of `excluded_columns` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
rm(dataset)

str(ds_step_1)
## 'data.frame':    4687 obs. of  24 variables:
##  $ Absolute.Magnitude          : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.       : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Est.Dia.in.Miles.max.       : num  0.1768 0.203 0.3217 0.0122 0.1768 ...
##  $ Relative.Velocity.km.per.sec: num  6.12 18.11 7.59 11.17 9.84 ...
##  $ Relative.Velocity.km.per.hr : num  22017 65210 27327 40226 35427 ...
##  $ Miles.per.hour              : num  13681 40519 16980 24995 22013 ...
##  $ Miss.Dist..Astronomical.    : num  0.419 0.383 0.051 0.285 0.408 ...
##  $ Miss.Dist..lunar.           : num  163.2 149 19.8 111 158.6 ...
##  $ Miss.Dist..kilometers.      : num  62753692 57298148 7622912 42683616 61010824 ...
##  $ Miss.Dist..miles.           : num  38993336 35603420 4736658 26522368 37910368 ...
##  $ Minimum.Orbit.Intersection  : num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Jupiter.Tisserand.Invariant : num  4.63 5.46 4.56 5.09 5.15 ...
##  $ Epoch.Osculation            : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity                : num  0.426 0.352 0.348 0.217 0.21 ...
##  $ Semi.Major.Axis             : num  1.41 1.11 1.46 1.26 1.23 ...
##  $ Inclination                 : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Asc.Node.Longitude          : num  314.4 136.7 259.5 57.2 84.6 ...
##  $ Orbital.Period              : num  610 426 644 514 496 ...
##  $ Perihelion.Distance         : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Perihelion.Arg              : num  57.3 313.1 248.4 18.7 158.3 ...
##  $ Aphelion.Dist               : num  2.01 1.5 1.97 1.53 1.48 ...
##  $ Mean.Anomaly                : num  264.8 173.7 292.9 68.7 135.1 ...
##  $ Mean.Motion                 : num  0.591 0.845 0.559 0.7 0.726 ...
##  $ Hazardous                   : chr  "True" "False" "True" "False" ...

1.3. Tomamos una muestra del 80% del dataset.

La idea de este paso es evitar tener los mismos resultado que otros análisis pre-existentes en kaggle. Para esto ordenamos aleatoria-mente las observaciones con el parámetro shuffle y luego tomamos una muestra del 80% de las observaciones.

c(ds_step_2, any) %<-% train_test_split(
  ds_step_1, 
  train_size = .8, 
  shuffle    = TRUE
)
## [1] "Train Set size: 3749"
## [1] "Test set size: 938"
rm(any)
rm(ds_step_1)

2. Eliminamos las columnas que están altamente co-relacionadas

Excluimos las columnas que tienen un correlación mayor al 80%. Del cada grupo atentamente co-relacionado nos quedamos con una sola variable, ya que todas con muy similares.

high_correlated_columns <- find_high_correlated_columns(
  feat(ds_step_2), 
  cutoff=.9
)
## Compare row 21  and column  15 with corr  0.975 
##   Means:  0.292 vs 0.218 so flagging column 21 
## Compare row 8  and column  9 with corr  1 
##   Means:  0.291 vs 0.21 so flagging column 8 
## Compare row 9  and column  7 with corr  1 
##   Means:  0.255 vs 0.204 so flagging column 9 
## Compare row 7  and column  10 with corr  1 
##   Means:  0.216 vs 0.201 so flagging column 7 
## Compare row 15  and column  12 with corr  0.93 
##   Means:  0.27 vs 0.195 so flagging column 15 
## Compare row 12  and column  23 with corr  0.993 
##   Means:  0.24 vs 0.189 so flagging column 12 
## Compare row 4  and column  5 with corr  1 
##   Means:  0.299 vs 0.178 so flagging column 4 
## Compare row 5  and column  6 with corr  1 
##   Means:  0.253 vs 0.165 so flagging column 5 
## Compare row 3  and column  2 with corr  1 
##   Means:  0.219 vs 0.154 so flagging column 3 
## All correlations <= 0.9
ds_step_3 <- ds_step_2 %>% dplyr::select(-high_correlated_columns[-1])

rm(ds_step_2)
str(ds_step_3)
## 'data.frame':    3749 obs. of  16 variables:
##  $ Absolute.Magnitude        : num  25.8 17.7 23.3 19.5 28.1 ...
##  $ Est.Dia.in.Miles.min.     : num  0.01143 0.47633 0.03562 0.20792 0.00396 ...
##  $ Miles.per.hour            : num  48093 28253 38598 41366 9717 ...
##  $ Miss.Dist..miles.         : num  42744884 26690324 21643130 44921436 590913 ...
##  $ Minimum.Orbit.Intersection: num  0.01548 0.01962 0.00237 0.04906 0.00149 ...
##  $ Epoch.Osculation          : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity              : num  0.24 0.578 0.447 0.464 0.147 ...
##  $ Inclination               : num  0.901 7.789 17.914 32.976 0.855 ...
##  $ Asc.Node.Longitude        : num  95.2 110.8 123 253.6 321.5 ...
##  $ Orbital.Period            : num  399 665 356 659 415 ...
##  $ Perihelion.Distance       : num  0.806 0.629 0.544 0.795 0.929 ...
##  $ Perihelion.Arg            : num  189.7 97.7 299.8 297.8 242.7 ...
##  $ Aphelion.Dist             : num  1.31 2.35 1.42 2.17 1.25 ...
##  $ Mean.Anomaly              : num  333 238 333 289 101 ...
##  $ Mean.Motion               : num  0.903 0.541 1.012 0.546 0.867 ...
##  $ Hazardous                 : chr  "False" "True" "False" "True" ...

3. Transformamos la varaible a predecir a tipo numericas

ds_step_4 <- target_to_num(ds_step_3)

rm(ds_step_3)
str(ds_step_4)
## 'data.frame':    3749 obs. of  16 variables:
##  $ Absolute.Magnitude        : num  25.8 17.7 23.3 19.5 28.1 ...
##  $ Est.Dia.in.Miles.min.     : num  0.01143 0.47633 0.03562 0.20792 0.00396 ...
##  $ Miles.per.hour            : num  48093 28253 38598 41366 9717 ...
##  $ Miss.Dist..miles.         : num  42744884 26690324 21643130 44921436 590913 ...
##  $ Minimum.Orbit.Intersection: num  0.01548 0.01962 0.00237 0.04906 0.00149 ...
##  $ Epoch.Osculation          : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity              : num  0.24 0.578 0.447 0.464 0.147 ...
##  $ Inclination               : num  0.901 7.789 17.914 32.976 0.855 ...
##  $ Asc.Node.Longitude        : num  95.2 110.8 123 253.6 321.5 ...
##  $ Orbital.Period            : num  399 665 356 659 415 ...
##  $ Perihelion.Distance       : num  0.806 0.629 0.544 0.795 0.929 ...
##  $ Perihelion.Arg            : num  189.7 97.7 299.8 297.8 242.7 ...
##  $ Aphelion.Dist             : num  1.31 2.35 1.42 2.17 1.25 ...
##  $ Mean.Anomaly              : num  333 238 333 289 101 ...
##  $ Mean.Motion               : num  0.903 0.541 1.012 0.546 0.867 ...
##  $ Hazardous                 : num  0 1 0 1 0 0 0 0 0 0 ...

4. Seleccionamos las variables

4.1 Feature importance

Tomamos las columnas que mejor separan las clases. Para realizar este paso vamos a utilizar la función feature_importance del algoritmo Random Forest. Esta función nos permite comparar las variables descuerdo a cuan buenas son para separa las clases. Luego tomaremos las 5 variables que mejor separa las clases.

result <- features_importance(target_to_str(ds_step_4), target = 'Hazardous')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target)` instead of `target` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
plot_features_importance(result)

n_best_features = 5
# n_best_features = 15

best_features <- top_acc_features(result, top=n_best_features)
## Selecting by MeanDecreaseGini
best_features
## [1] "Minimum.Orbit.Intersection" "Absolute.Magnitude"        
## [3] "Est.Dia.in.Miles.min."      "Perihelion.Distance"       
## [5] "Inclination"
length(best_features)
## [1] 5
ds_step_5 <- ds_step_4 %>% dplyr::select(c(best_features, c(Hazardous)))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(best_features)` instead of `best_features` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

4.2 Cluster de variables

En este apartado vamos a internar agrupas las variables que son parecida y luego seleccionar una de cada grupo. Es otra forma de eliminar variables muy correlaciones

scaled_ds_step_4 <- feat(ds_step_4) %>% mutate(~(scale(.) %>% as.vector))

clustering_metrics_plot(scaled_ds_step_4)

kmeans_variable_clusters(feat(ds_step_4), n_clusters = 2)
kmeans_variable_clusters(feat(ds_step_4), n_clusters = 4)

4.3 PCA

En este apartado la idea es seleccionar variables no cor relacionadas. Esto se puede detectar por medio del angulo entre las variables originales. Si el angulo es muy pequeño, entonces las variables están altamente correlacionadas pudiendo seleccionar una de ellas.

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3749 individuals, described by 15 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
rm(ds_step_4)
str(ds_step_5)
## 'data.frame':    3749 obs. of  6 variables:
##  $ Minimum.Orbit.Intersection: num  0.01548 0.01962 0.00237 0.04906 0.00149 ...
##  $ Absolute.Magnitude        : num  25.8 17.7 23.3 19.5 28.1 ...
##  $ Est.Dia.in.Miles.min.     : num  0.01143 0.47633 0.03562 0.20792 0.00396 ...
##  $ Perihelion.Distance       : num  0.806 0.629 0.544 0.795 0.929 ...
##  $ Inclination               : num  0.901 7.789 17.914 32.976 0.855 ...
##  $ Hazardous                 : num  0 1 0 1 0 0 0 0 0 0 ...

5. Filtramos outliers

ds_step_6 <- filter_outliers_m1(ds_step_5, max_score = 0.55)

## [1] "Outliers: 3 %"
## [1] "Dataset without outliers: 97 %"
# ds_step_6 <- filter_outliers_m2(ds_step_5)

rm(ds_step_5)

6. Análisis Exploratorio

6.1. Proporción de classes.

Se puede ver que el numero de casos negativos es mucho mas alto que el numero de casos positivos.

ggplot(
  ds_step_6 %>% group_by(Hazardous) %>% tally(), 
  aes(x=Hazardous, y=n)) +
  geom_bar(stat="identity", width=0.8)

ds_step_6 %>% 
  group_by(Hazardous) %>% 
  tally() %>% 
  dplyr::rename(Count = n) %>%
  mutate(Percent = (Count / nrow(ds_step_6))*100)

6.1. Boxplot comparativos

coparative_boxplot(feat(ds_step_6), to_col=n_best_features)

6.2. Histogramas y densidad

comparative_histplot(feat(ds_step_6), to_col=n_best_features)

6.3. Analizamos gráfico de normalidad univariado

## [[1]]
## [1] 3.7e-24
## 
## [[2]]
## [1] 3.7e-24
## 
## [[3]]
## [1] 3.7e-24
## 
## [[4]]
## [1] 3.7e-24
## 
## [[5]]
## [1] 3.7e-24

Observaciones: Al parecer solo Perihelion.Distance parece ser normal o por lo enos la mas nromal de todas.

6.3. Test de normalidad uni-variado

uni_shapiro_test(feat(ds_step_6))
## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.82248, p-value < 2.2e-16
## 
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.98529, p-value < 2.2e-16
## 
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.7019, p-value < 2.2e-16
## 
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.98546, p-value < 2.2e-16
## 
## [1] "=> Variable: Inclination"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.90036, p-value < 2.2e-16

Observaciones: En todos los casos el p-valor < 0.05 y se rechaza normalidad en todas las variables. Esto coincido con los qqplot donde en todos los casos no son normales salvo Perihelion.Distance que parece tender anormalidad.

6.4. Test de normalidad muti-variado

mult_shapiro_test(feat(ds_step_6))
## 
##  Shapiro-Wilk normality test
## 
## data:  Z
## W = 0.62616, p-value < 2.2e-16

Observaciones: El p-valore < 0.05 por lo tanto se rechaza normalidad multi-variada. Se corresponde con el resultado de los tests de shapiro uni-variados y los qqplot’s.

6.5. Test de homocedasticidad multi-variado

multi_boxm_test(feat(ds_step_6), target(ds_step_6))
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  features
## Chi-Sq (approx.) = 2635, df = 15, p-value < 2.2e-16

Observaciones: El p-valor < 0.05 por lo tanto se rechaza la hipótesis nula y podemos decir que las variables no son homocedasticas.

6.6. Correlaciones entre variables

plot_correlations(feat(ds_step_6))

6.7. Análisis completo

6.8. PCA: Comparación de variables originales con/sin la variable a predecir.

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3642 individuals, described by 6 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 3642 individuals, described by 5 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

6.9. PCA: Con observaciones discriminadas pro clase. Primero quitamos

outliers para poder ver con mas claridad el biplot.

plot_robust_pca(ds_step_6)

7. Train test split

Partimos el dataset en los conjuntos de entrenamiento y prueba. Ademas antes de partimos ordenamos las observaciones aleatoriamente para que los modelos de clasificación no aprendan secuencia si es que existe alguna en el orden original.

c(raw_train_set, raw_test_set) %<-% train_test_split(
  ds_step_6, 
  train_size = .8, 
  shuffle    = TRUE
)
## [1] "Train Set size: 2913"
## [1] "Test set size: 729"

8. Método SMOTE para balancear el dataset.

# balanced_train_set <- smote_balance(raw_train_set, raw_train_set$Hazardous)
#rm(raw_train_set)
balanced_train_set <- raw_train_set

9. Escalamos las variables numéricas

Restamos la media y dividimos por el desvío.

train_set <- balanced_train_set %>% 
  mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))
test_set  <- raw_test_set %>% 
  mutate_at(vars(-Hazardous), ~(scale(.) %>% as.vector))

rm(balanced_train_set)
rm(raw_test_set)

10. Clasificacion

Antes que nada se debe definir el valor de beta para la métrica F beta Score. En este problema debemos tener en cuenta dos factores para decidir cual el mejor valor de beta:

Por las razones ya explicadas necesitamos priorizar el recall por sobre la precisión. Para explicarlo de forma simple, es preferible predecir algun negativos como positivos que predecir perfectamente los positivos a costa de tener falsos negativos.

Finalmente para priorizar el recall elegimos un beta > 1.

beta = 2

10.1. Entrenamos un modelo LDA

lda_model <- lda(formula(Hazardous~.), train_set)
lda_threshold <- search_min_fn_threshold(
  lda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

lda_test_pred  <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)
plot_cm(lda_test_pred, target(test_set))

plot_roc(lda_test_pred, target(test_set))

fbeta_score(lda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.622406639004149"
lda_model$scaling
##                                    LD1
## Minimum.Orbit.Intersection -1.25730298
## Absolute.Magnitude         -1.61409347
## Est.Dia.in.Miles.min.      -0.39646592
## Perihelion.Distance         0.08304722
## Inclination                 0.01058829

10.2. Entrenamos un modelo QDA

qda_model <- qda(formula(Hazardous~.), train_set)
qda_threshold <- search_min_fn_threshold(
  qda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

qda_test_pred  <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)
plot_cm(qda_test_pred, target(test_set))

plot_roc(qda_test_pred, target(test_set))

fbeta_score(qda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.904977375565611"

10.3. Entrenamos un modelo RDA

rda_model <- rda(formula(Hazardous~.), train_set)
rda_threshold <- search_min_fn_threshold(
  rda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

rda_test_pred  <- xda_predict(rda_model, feat(test_set),  rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)
plot_cm(rda_test_pred, target(test_set))

plot_roc(rda_test_pred, target(test_set))

fbeta_score(rda_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.900900900900901"

10.4. Entrenamos un modelo de regresión logística

lr_model <- glm(formula(Hazardous~.), train_set, family=binomial)
lr_threshold <- search_min_fn_threshold(
  lr_model, 
  feat(test_set), 
  target(test_set),
  model_predict
)

lr_test_pred  <- model_predict(lr_model, test_set,  lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)
plot_cm(lr_test_pred, target(test_set))

fp(lr_test_pred, target(test_set))
## [1] 12
fn(lr_test_pred, target(test_set))
## [1] 15
plot_roc(lr_test_pred, target(test_set))

fbeta_score(lr_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.879396984924623"

10.5. Entrenamos un modelo SVM

svm_model <- svm(formula(Hazardous~.), train_set, kernel = 'radial')
svm_threshold <- search_min_fn_threshold(
  svm_model, 
  feat(test_set), 
  target(test_set),
  model_predict
)

svm_test_pred  <- model_predict(svm_model, test_set,  svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)
plot_cm(svm_test_pred, target(test_set))

plot_roc(svm_test_pred, target(test_set))

fbeta_score(svm_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.68259385665529"

10.6. Entrenamos un modelo XGBoost

xgboost_model <- xgboost(
 as.matrix(feat(train_set)), 
 target(train_set),
 eta         = 0.2,
 max_depth   = 20,
 nround      = 15000,
 eval_metric = 'logloss',
 objective   = "binary:logistic",
 nthread     = 24,
 verbose     = 0
)
xgb_threshold <- search_min_fn_threshold(
  xgboost_model, 
  feat(test_set), 
  target(test_set),
  xgboost_predict
)

xgb_test_pred  <- xgboost_predict(xgboost_model, feat(test_set),  xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
plot_cm(xgb_test_pred, target(test_set))

plot_roc(xgb_test_pred, target(test_set))

fbeta_score(xgb_test_pred, target(test_set), beta=2)
## [1] " F2Score: 0.986733001658375"

10.7. Comparativa de metricas

10.7.1 Metricas seleccionando el threshould con minimo False Negative

data.frame(
  Test.F2Score.Percent = c(
    fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(lr_test_pred,  target(test_set), beta=2, show=F),
    fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
  ),
  Train.F2Score.Percent = c(
    fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(lr_train_pred,  target(train_set), beta=2, show=F),
    fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
  ),
  False.Negative = c(
    fn(lda_test_pred, target(test_set)),
    fn(qda_test_pred, target(test_set)),
    fn(rda_test_pred, target(test_set)),
    fn(lr_test_pred,  target(test_set)),
    fn(svm_test_pred, target(test_set)),
    fn(xgb_test_pred, target(test_set))
  ),
  False.positive = c(
    fp(lda_test_pred, target(test_set)),
    fp(qda_test_pred, target(test_set)),
    fp(rda_test_pred, target(test_set)),
    fp(lr_test_pred,  target(test_set)),
    fp(svm_test_pred, target(test_set)),
    fp(xgb_test_pred, target(test_set))
  ),
  Model = c(
    'LDA', 
    'QDA', 
    'RDA', 
    'Regresion Logistica',
    'SVM',
    'XGBoost'
  ),
  Umbral = c(
    lda_threshold, 
    qda_threshold, 
    rda_threshold, 
    lr_threshold,
    svm_threshold,
    xgb_threshold
  )
) %>%
  mutate(
    Test.F2Score.Percent  = round(Test.F2Score.Percent * 100, 3),
    Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
  ) %>%
  dplyr::select(
    False.Negative,
    False.positive,
    Test.F2Score.Percent,
    Train.F2Score.Percent,
    Model,
    Umbral
  ) %>%
  arrange(False.Negative, desc(Test.F2Score.Percent))

10.7.2 Metricas seleccionando el threshould con maximo AUR

lda_threshold <- search_max_aur_threshold(
  lda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

lda_test_pred  <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)

qda_threshold <- search_max_aur_threshold(
  qda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

qda_test_pred  <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)


rda_threshold <- search_max_aur_threshold(
  rda_model, 
  feat(test_set), 
  target(test_set),
  xda_predict
)

rda_test_pred  <- xda_predict(rda_model, feat(test_set),  rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)

lr_threshold <- search_max_aur_threshold(
  lr_model, 
  feat(test_set), 
  target(test_set),
  model_predict
)

lr_test_pred  <- model_predict(lr_model, test_set,  lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)

svm_threshold <- search_max_aur_threshold(
  svm_model, 
  feat(test_set), 
  target(test_set),
  model_predict
)

svm_test_pred  <- model_predict(svm_model, test_set,  svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)


xgb_threshold <- search_max_aur_threshold(
  xgboost_model, 
  feat(test_set), 
  target(test_set),
  xgboost_predict
)

xgb_test_pred  <- xgboost_predict(xgboost_model, feat(test_set),  xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)



data.frame(
  Test.F2Score.Percent = c(
    fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(lr_test_pred,  target(test_set), beta=2, show=F),
    fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
  ),
  Train.F2Score.Percent = c(
    fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(lr_train_pred,  target(train_set), beta=2, show=F),
    fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
  ),
  False.Negative = c(
    fn(lda_test_pred, target(test_set)),
    fn(qda_test_pred, target(test_set)),
    fn(rda_test_pred, target(test_set)),
    fn(lr_test_pred,  target(test_set)),
    fn(svm_test_pred, target(test_set)),
    fn(xgb_test_pred, target(test_set))
  ),
  False.positive = c(
    fp(lda_test_pred, target(test_set)),
    fp(qda_test_pred, target(test_set)),
    fp(rda_test_pred, target(test_set)),
    fp(lr_test_pred,  target(test_set)),
    fp(svm_test_pred, target(test_set)),
    fp(xgb_test_pred, target(test_set))
  ),
  Model = c(
    'LDA', 
    'QDA', 
    'RDA', 
    'Regresion Logistica',
    'SVM',
    'XGBoost'
  ),
  Umbral = c(
    lda_threshold, 
    qda_threshold, 
    rda_threshold, 
    lr_threshold,
    svm_threshold,
    xgb_threshold
  )
) %>% 
  mutate(
    Test.F2Score.Percent  = round(Test.F2Score.Percent * 100, 3),
    Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
  ) %>%
  dplyr::select(
    False.Negative,
    False.positive,
    Test.F2Score.Percent,
    Train.F2Score.Percent,
    Model,
    Umbral
  ) %>%
  arrange(False.Negative, desc(Test.F2Score.Percent))

10.7.3 Metricas seleccionando threshould=0.5

lda_threshold  <- 0.5
lda_test_pred  <- xda_predict(lda_model, feat(test_set), lda_threshold)
lda_train_pred <- xda_predict(lda_model, feat(train_set), lda_threshold)

qda_threshold  <- 0.5
qda_test_pred  <- xda_predict(qda_model, feat(test_set), qda_threshold)
qda_train_pred <- xda_predict(qda_model, feat(train_set), qda_threshold)


rda_threshold  <- 0.5
rda_test_pred  <- xda_predict(rda_model, feat(test_set),  rda_threshold)
rda_train_pred <- xda_predict(rda_model, feat(train_set), rda_threshold)

lr_threshold  <- 0.5
lr_test_pred  <- model_predict(lr_model, test_set,  lr_threshold)
lr_train_pred <- model_predict(lr_model, train_set, lr_threshold)

svm_threshold  <- 0.5
svm_test_pred  <- model_predict(svm_model, test_set,  svm_threshold)
svm_train_pred <- model_predict(svm_model, train_set, svm_threshold)

xgb_threshold  <- 0.5
xgb_test_pred  <- xgboost_predict(xgboost_model, feat(test_set),  xgb_threshold)
xgb_train_pred <- xgboost_predict(xgboost_model, feat(train_set), xgb_threshold)
data.frame(
  Test.F2Score.Percent = c(
    fbeta_score(lda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(qda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(rda_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(lr_test_pred,  target(test_set), beta=2, show=F),
    fbeta_score(svm_test_pred, target(test_set), beta=2, show=F),
    fbeta_score(xgb_test_pred, target(test_set), beta=2, show=F)
  ),
  Train.F2Score.Percent = c(
    fbeta_score(lda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(qda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(rda_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(lr_train_pred,  target(train_set), beta=2, show=F),
    fbeta_score(svm_train_pred, target(train_set), beta=2, show=F),
    fbeta_score(xgb_train_pred, target(train_set), beta=2, show=F)
  ),
  False.Negative = c(
    fn(lda_test_pred, target(test_set)),
    fn(qda_test_pred, target(test_set)),
    fn(rda_test_pred, target(test_set)),
    fn(lr_test_pred,  target(test_set)),
    fn(svm_test_pred, target(test_set)),
    fn(xgb_test_pred, target(test_set))
  ),
  False.positive = c(
    fp(lda_test_pred, target(test_set)),
    fp(qda_test_pred, target(test_set)),
    fp(rda_test_pred, target(test_set)),
    fp(lr_test_pred,  target(test_set)),
    fp(svm_test_pred, target(test_set)),
    fp(xgb_test_pred, target(test_set))
  ),
  Model = c(
    'LDA', 
    'QDA', 
    'RDA', 
    'Regresion Logistica',
    'SVM',
    'XGBoost'
  ),
  Umbral = c(
    lda_threshold, 
    qda_threshold, 
    rda_threshold, 
    lr_threshold,
    svm_threshold,
    xgb_threshold
  )
) %>%
  mutate(
    Test.F2Score.Percent  = round(Test.F2Score.Percent * 100, 3),
    Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3)
  ) %>%
  dplyr::select(
    False.Negative,
    False.positive,
    Test.F2Score.Percent,
    Train.F2Score.Percent,
    Model,
    Umbral
  ) %>%
  arrange(False.Negative, desc(Test.F2Score.Percent))

11. KMeans Clustering

11.1. Primero definimos cuantos grupos utilizar.

Típica con estimadores de la normal.

scaled_data_1 <- feat(ds_step_6) %>% mutate(~(scale(.) %>% as.vector))
clustering_metrics_plot(scaled_data_1)

Escalamiento diferente de la típica normal.

scaled_data_2 <- apply(
  feat(ds_step_6), 
  2, 
  function(x) { (x - min(x)) / (max(x) - min(x))}
)
clustering_metrics_plot(scaled_data_2)

11.2. Kmeans

n_clusters <- 2
kmeans_model <- kmeans(scaled_data_1, n_clusters)

km_ds_step_6 <- data.frame(ds_step_6)
km_ds_step_6$kmeans <- kmeans_model$cluster

10.2. biplot

# ds_without_outliers <- filter_outliers_m1(km_ds_step_6, max_score=0.5)
ds_without_outliers <- filter_outliers_m2(km_ds_step_6)
## [1] "Outliers: 5 %"
## [1] "Dataset without outliers: 95 %"
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-kmeans),
  groups = factor(ds_without_outliers$kmeans),
)

11.3 Hierarchical Clustering

Matriz de distancias euclídeas

mat_dist <- dist(x = ds_step_6, method = "euclidean") 

Dendrogramas (según el tipo de segmentación jerárquica aplicada)

hc_complete <- hclust(d = mat_dist, method = "complete") 
hc_average  <- hclust(d = mat_dist, method = "average")
hc_single   <- hclust(d = mat_dist, method = "single")
hc_ward     <- hclust(d = mat_dist, method = "ward.D2")

Calculo del coeficiente de correlación cofenetico

cor(x = mat_dist, cophenetic(hc_complete))
## [1] 0.7663681
cor(x = mat_dist, cophenetic(hc_average))
## [1] 0.7802204
cor(x = mat_dist, cophenetic(hc_single))
## [1] 0.5472188
cor(x = mat_dist, cophenetic(hc_ward))
## [1] 0.5670307

Construcción de un dendograma usando los resultados de la técnica de Ward

n_clusters <- 2

plot(hc_ward)
rect.hclust(hc_ward, k=n_clusters, border="red")

hc_ds <- data.frame(ds_step_6)
hc_ds$her_ward <- cutree(hc_ward, k=n_clusters)
# ds_without_outliers <- filter_outlier_m1(hc_ds, max_score=0.57)
ds_without_outliers <- filter_outliers_m2(hc_ds)
## [1] "Outliers: 5 %"
## [1] "Dataset without outliers: 95 %"
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-her_ward),
  groups = factor(ds_without_outliers$her_ward),
  colours=c("orange","cyan","blue","magenta","yellow","black"),
  labels=c("grupo 1", "grupo 2","grupo 3","grupo 4","grupo 5","grupo 6")
)

 

Realizado por Adrian Marino

adrianmarino@gmail.com